This notebook uses the cleaned, processed data produced by the Step 3 - Data Augmentation Python notebook. **Load Key Libraries**
library(magrittr)
library(HDclassif)
Loading required package: MASS
library(psych)
library(cluster)
library(ggplot2)
Attaching package: ‘ggplot2’
The following objects are masked from ‘package:psych’:
%+%, alpha
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────── tidyverse 1.3.2 ──✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.4.1
✔ readr 2.1.3 ✔ forcats 0.5.2
✔ purrr 0.3.5 ── Conflicts ────────────────────────────────────────────── tidyverse_conflicts() ──
✖ ggplot2::%+%() masks psych::%+%()
✖ ggplot2::alpha() masks psych::alpha()
✖ tidyr::extract() masks magrittr::extract()
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
✖ dplyr::select() masks MASS::select()
✖ purrr::set_names() masks magrittr::set_names()
library(FactoMineR)
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)

# Signature reminder (helper defined elsewhere):
#   run_pca <- function(data_frame, pca_type, components)
dfl_famd <- run_pca(dfz_FAMD, "FAMD", 22)

# Agglomeration methods and distance metrics to try. The trailing comments
# keep the other candidates that were considered.
meth_list <- list("ward.D", "centroid", "median") # , "average", "median", "centroid")
dist_list <- list("minkowski", "maximum") # , "canberra")

# Collects one fun_nc() result per (method, distance) pair, method-major order.
bclist <- list()

# Evaluate every combination. fun_nc() is defined elsewhere; the literals 2
# and 5 are presumably the minimum and maximum number of clusters to test
# (the printed summaries only propose 2 to 5) - TODO confirm against fun_nc.
for (linkage in meth_list) {
  for (metric in dist_list) {
    best_cluster <- fun_nc(dfl_famd, metric, 2, 5, linkage)
    bclist[[length(bclist) + 1]] <- best_cluster
  }
}
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 7 proposed 2 as the best number of clusters
* 3 proposed 3 as the best number of clusters
* 8 proposed 4 as the best number of clusters
* 3 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 4
*******************************************************************
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 7 proposed 2 as the best number of clusters
* 10 proposed 3 as the best number of clusters
* 2 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
[1] "Frey index : No clustering structure in this data set"
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 8 proposed 2 as the best number of clusters
* 11 proposed 3 as the best number of clusters
* 2 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
[1] "Frey index : No clustering structure in this data set"
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 9 proposed 2 as the best number of clusters
* 7 proposed 3 as the best number of clusters
* 3 proposed 4 as the best number of clusters
* 4 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
[1] "Frey index : No clustering structure in this data set"
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 8 proposed 2 as the best number of clusters
* 13 proposed 3 as the best number of clusters
* 2 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
[1] "Frey index : No clustering structure in this data set"
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 9 proposed 2 as the best number of clusters
* 6 proposed 3 as the best number of clusters
* 5 proposed 4 as the best number of clusters
* 3 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
Step 4: Execute the clustering using dfl_famd
# Signature reminder (helper defined elsewhere):
#fun_clust <- function(data_frame, distance, agglo_method, clusters)
# Cluster the FAMD output into 4 groups using the "minkowski" distance and
# "ward.D" agglomeration.
# NOTE(review): the variable name reads like "minkowski + centroid", but the
# call passes "ward.D" - confirm which linkage is intended.
# NOTE(review): 4 clusters presumably follows the majority-rule conclusion
# printed for the first (ward.D, minkowski) run above - confirm.
famd_min_cen_c <- fun_clust(dfl_famd, "minkowski", "ward.D", 4)
Evaluate Clusters
# Count how many observations were assigned to each cluster.
table(famd_min_cen_c)
famd_min_cen_c
1 2 3 4
1109 1806 1265 411
# Work on a copy of the source data so zdata itself is left untouched.
tmp_data <- zdata

# Guard against silent recycling: assigning a shorter vector to a base
# data-frame column is recycled without warning when its length divides the
# row count, which would attach cluster ids to the wrong rows.
stopifnot(length(famd_min_cen_c) == nrow(tmp_data))
tmp_data$cluster <- famd_min_cen_c
# Factor so ggplot draws one box per cluster instead of a continuous axis.
tmp_data$cluster <- as.factor(tmp_data$cluster)

# Box plots of selected variables by cluster. A bare `p` / `q` at top level
# prints (renders) the plot.
p <- ggplot(tmp_data, aes(x = cluster, y = rentZestimate)) +
  geom_boxplot()
p
q <- ggplot(tmp_data, aes(x = cluster, y = Sch_Rat_Avg)) +
  geom_boxplot()
q
p <- ggplot(tmp_data, aes(x = cluster, y = Income_per_return)) +
  geom_boxplot()
p
q <- ggplot(tmp_data, aes(x = cluster, y = violent_crime_total_rate)) +
  geom_boxplot()
q

# Human-readable label built from the cluster id itself. The previous nested
# ifelse() hard-coded ids 1-4 and labelled every other id "Cluster 5", which
# would mislabel clusters as soon as the cluster count changed. For ids 1-4
# the output is identical ("Cluster 1" ... "Cluster 4"); missing ids stay NA.
tmp_data$ClusterCategory <- ifelse(
  is.na(tmp_data$cluster),
  NA_character_,
  paste("Cluster", tmp_data$cluster)
)

# Export the labelled data without a row-name column.
write.csv(tmp_data, "cluster_output4.csv", row.names = FALSE)
Remove the temporary dataset
# Drop the working copy from the hierarchical-clustering step; it is rebuilt
# below for the PAM result.
rm(tmp_data)
# Pairwise Gower dissimilarities computed on the FAMD output.
gower_dist <- cluster::daisy(dfl_famd, metric = "gower")
# Fix the RNG seed so the steps below are repeatable.
set.seed(123)
# Partitioning Around Medoids on the dissimilarities, asking for 3 clusters.
# NOTE(review): k = 3 presumably follows the majority-rule summaries printed
# above - confirm.
pam_cluster <- cluster::pam(gower_dist, k = 3)
# Count how many observations fall in each PAM cluster.
table(pam_cluster$clustering)
1 2 3
1720 1593 1278
# Fresh working copy of the source data, tagged with the PAM cluster ids.
# Stored as a factor so each cluster gets its own box on the x axis.
tmp_data <- zdata
tmp_data$cluster <- as.factor(pam_cluster$clustering)

# Living area by cluster.
p <- ggplot(tmp_data, aes(x = cluster, y = livingArea)) + geom_boxplot()
p

# Average school rating by cluster.
q <- ggplot(tmp_data, aes(x = cluster, y = Sch_Rat_Avg)) + geom_boxplot()
q

# Income per return by cluster.
p <- ggplot(tmp_data, aes(x = cluster, y = Income_per_return)) + geom_boxplot()
p

# Violent crime rate by cluster.
q <- ggplot(tmp_data, aes(x = cluster, y = violent_crime_total_rate)) + geom_boxplot()
q
*** End of Part 2 ***